library(tidyverse)
library(janitor)
library(readxl)
library(usmap)
library(stargazer)
library(haven)
library(tigris)
library(rio)
library(standardize)
library(effsize)
library(ggthemes)
# Controls v2 from ACS
controls_2019 <- read_csv("controls_2019_v2.csv") %>%
mutate(total_pop = SE_A00001_001) %>%
mutate(percent_white = SE_A03001_002/total_pop) %>%
mutate(percent_bachelors = SE_B12001_004/SE_B12001_001) %>%
mutate(med_household_income = SE_A14006_001) %>%
mutate(percent_hispanic = SE_B04001_010/total_pop) %>%
select(Geo_FIPS, Geo_NAME, Geo_STATE, Geo_COUNTY, total_pop,
percent_white, percent_bachelors, med_household_income,
percent_hispanic) %>%
mutate(fips = as.numeric(Geo_FIPS))
cbp_2018 <- read_csv("cbp_2018.csv") %>%
mutate(avg_wage = SE_T1200_001) %>%
mutate(fips = as.numeric(Geo_FIPS)) %>%
select(fips, avg_wage)
controls_2019 <- left_join(controls_2019, cbp_2018, by = "fips")
# Other Controls
poverty <- read_excel("PovertyEstimates.xls") %>%
row_to_names(4) %>%
clean_names() %>%
mutate(fips = as.numeric(fip_stxt),
poverty = as.numeric(pctpovall_2019)/100) %>%
select(fips, poverty)
unemployment <- read_excel("Unemployment.xlsx") %>%
row_to_names(4) %>%
clean_names() %>%
mutate(fips = as.numeric(fips_code),
unemp = as.numeric(unemployment_rate_2019)/100) %>%
select(fips, unemp)
controls_2019 <- controls_2019 %>%
left_join(poverty, by = "fips") %>%
left_join(unemployment, by = "fips")
# Import county migration data
c2c1519 <- import_list("c2c1519.xlsx")
c2c1216 <- import_list("c2c1216.xlsx")
c2c0812 <- import_list("c2c0812.xls")
# Rename columns
for (i in 1:52){
colnames(c2c1519[[i]]) <- c("state_code_a",
"fips_a",
"state_code_b",
"fips_b",
"state_name_a",
"county_name_a",
"state_name_b",
"county_name_b",
"flow_ba",
"flow_ba_moe",
"flow_ab",
"flow_ab_moe",
"net_ba",
"net_ba_moe",
"gross_ab",
"gross_ab_moe")
colnames(c2c1216[[i]]) <- c("state_code_a",
"fips_a",
"state_code_b",
"fips_b",
"state_name_a",
"county_name_a",
"state_name_b",
"county_name_b",
"flow_ba",
"flow_ba_moe",
"flow_ab",
"flow_ab_moe",
"net_ba",
"net_ba_moe",
"gross_ab",
"gross_ab_moe")
colnames(c2c0812[[i]]) <- c("state_code_a",
"fips_a",
"state_code_b",
"fips_b",
"state_name_a",
"county_name_a",
"state_name_b",
"county_name_b",
"flow_ba",
"flow_ba_moe",
"flow_ab",
"flow_ab_moe",
"net_ba",
"net_ba_moe",
"gross_ab",
"gross_ab_moe")
}
# Delete extraneous rows
for (i in 1:52){
c2c1519[[i]] <- c2c1519[[i]] %>%
filter(!row_number() %in% c(1, 2))
c2c1216[[i]] <- c2c1216[[i]] %>%
filter(!row_number() %in% c(1, 2))
c2c0812[[i]] <- c2c0812[[i]] %>%
filter(!row_number() %in% c(1, 2))
}
# Bind datasets together by year
c2c1519 <- bind_rows(c2c1519)
c2c1216 <- bind_rows(c2c1216)
c2c0812 <- bind_rows(c2c0812)
# Combine county migration data with controls and election data
df_2020 <- c2c1519 %>%
filter(is.na(fips_b) == FALSE) %>%
mutate(comb_fips_a = paste0(state_code_a, fips_a)) %>%
group_by(county_name_a, comb_fips_a) %>%
summarize(flow_ba = sum(as.numeric(flow_ba))) %>%
arrange(comb_fips_a) %>%
mutate(fips = as.numeric(comb_fips_a))
countypres2020 <- read_csv("countypres_2000-2020.csv") %>%
filter(year == 2020) %>%
filter(party == "DEMOCRAT" | party == "REPUBLICAN") %>%
group_by(county_fips) %>%
summarize(total_2p_votes = sum(candidatevotes)) %>%
mutate(fips = as.numeric(county_fips)) %>%
select(fips, total_2p_votes)
countypres2020_2p <- read_csv("countypres_2000-2020.csv") %>%
filter(year == 2020) %>%
filter(party == "DEMOCRAT") %>%
mutate(fips = as.numeric(county_fips)) %>%
group_by(fips) %>%
summarize(candidatevotes = sum(candidatevotes)) %>%
left_join(countypres2020, by = "fips") %>%
mutate(dem_percent = candidatevotes/total_2p_votes)
countypres2020_2p_rep <- read_csv("countypres_2000-2020.csv") %>%
filter(year == 2020) %>%
filter(party == "REPUBLICAN") %>%
mutate(fips = as.numeric(county_fips)) %>%
group_by(fips) %>%
summarize(candidatevotes = sum(candidatevotes)) %>%
left_join(countypres2020, by = "fips") %>%
mutate(rep_percent = candidatevotes/total_2p_votes) %>%
select(fips, rep_percent)
df_2020_election <- left_join(df_2020, countypres2020_2p, by = "fips")
df_2020_combined <- left_join(df_2020_election, controls_2019, by = "fips") %>%
mutate(avg_immi_rate = flow_ba/total_pop)
df <- df_2020_combined %>%
filter(Geo_STATE != "02" && Geo_STATE != "72")
write.csv(df, "migration_df.csv")
# Basic linear regression
stargazer(lm(dem_percent ~ avg_immi_rate, data = df_2020_combined, weight = total_pop),
type = "text")
===============================================
Dependent variable:
---------------------------
dem_percent
-----------------------------------------------
avg_immi_rate -1.542***
(0.132)
Constant 0.609***
(0.008)
-----------------------------------------------
Observations 3,113
R2 0.042
Adjusted R2 0.042
Residual Std. Error 53.990 (df = 3111)
F Statistic 136.485*** (df = 1; 3111)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Visualization of relationship between immigration rate and dem percent
ggplot(df_2020_combined, aes(x = avg_immi_rate, y = dem_percent, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Fall in Democratic Voting with Increase in Rate of Immigration",
x = "Immigration Rate",
y = "Two-Party Democratic Vote Share",
subtitle = "2020 Presidential Election",
caption = "Weighted by County Population")
Check for missing value in the economic data Check Alaska!! What are the patterns we expect to see if this is true, does the data look like what it would if this is happening Have a hypothesis that this should impact behavior, do you see a pattern that isn’t explained by other things Matching Viriginia reports cities and counties around them differently and doesn’t line up - drop ones that don’t align cleanly (fips codes matched things are weird, census has the full county but the data is reported split between the city and everything else) Know things about the places that people are coming from, -1, 0, 1 net partisanship of the people moving into an area, sorting, impact on people there
# Regression with controls
stargazer(lm(dem_percent ~ avg_immi_rate + percent_white + percent_hispanic +
percent_bachelors + avg_wage + poverty + unemp + med_household_income, data = df_2020_combined,
weight = total_pop),
lm(dem_percent ~ avg_immi_rate + percent_white + percent_hispanic +
percent_bachelors, data = df_2020_combined,
weight = total_pop),
lm(dem_percent ~ avg_immi_rate + avg_wage + poverty + unemp + med_household_income, data = df_2020_combined,
weight = total_pop),
type = "text")
======================================================================================================
Dependent variable:
---------------------------------------------------------------------------------
dem_percent
(1) (2) (3)
------------------------------------------------------------------------------------------------------
avg_immi_rate -0.663*** -0.668*** -0.589***
(0.072) (0.074) (0.107)
percent_white -0.476*** -0.519***
(0.011) (0.010)
percent_hispanic 0.171*** 0.173***
(0.010) (0.010)
percent_bachelors 1.098*** 0.796***
(0.024) (0.014)
avg_wage 0.00000*** 0.00001***
(0.00000) (0.00000)
poverty -0.230*** 1.547***
(0.066) (0.093)
unemp 1.742*** 0.004
(0.159) (0.244)
med_household_income -0.00000*** 0.00000***
(0.00000) (0.00000)
Constant 0.618*** 0.652*** -0.155***
(0.022) (0.011) (0.025)
------------------------------------------------------------------------------------------------------
Observations 3,112 3,113 3,112
R2 0.778 0.745 0.440
Adjusted R2 0.778 0.744 0.439
Residual Std. Error 26.005 (df = 3103) 27.879 (df = 3108) 41.318 (df = 3106)
F Statistic 1,361.864*** (df = 8; 3103) 2,267.819*** (df = 4; 3108) 487.790*** (df = 5; 3106)
======================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Regression standardized
df_2020_combined$dem_percent_scaled <- scale(df_2020_combined$dem_percent)[, 1]
df_2020_combined$avg_immi_rate_scaled <- scale(df_2020_combined$avg_immi_rate)[, 1]
df_2020_combined$percent_white_scaled <- scale(df_2020_combined$percent_white)[, 1]
df_2020_combined$percent_bachelors_scaled <- scale(df_2020_combined$percent_bachelors)[, 1]
df_2020_combined$med_household_income_scaled <- scale(df_2020_combined$med_household_income)[, 1]
df_2020_combined$percent_hispanic_scaled <- scale(df_2020_combined$percent_hispanic)[, 1]
df_2020_combined$avg_wage_scaled <- scale(df_2020_combined$avg_wage)[, 1]
df_2020_combined$poverty_scaled <- scale(df_2020_combined$poverty)[, 1]
df_2020_combined$unemp_scaled <- scale(df_2020_combined$unemp)[, 1]
stargazer(lm(dem_percent_scaled ~ avg_immi_rate_scaled + percent_white_scaled + percent_bachelors_scaled +
med_household_income_scaled + percent_hispanic_scaled +
avg_wage_scaled + poverty_scaled + unemp_scaled, data = df_2020_combined,
weight = total_pop), type = "text")
=======================================================
Dependent variable:
---------------------------
dem_percent_scaled
-------------------------------------------------------
avg_immi_rate_scaled -0.116***
(0.013)
percent_white_scaled -0.504***
(0.012)
percent_bachelors_scaled 0.642***
(0.014)
med_household_income_scaled -0.257***
(0.018)
percent_hispanic_scaled 0.204***
(0.011)
avg_wage_scaled 0.068***
(0.010)
poverty_scaled -0.082***
(0.023)
unemp_scaled 0.192***
(0.018)
Constant 0.239***
(0.013)
-------------------------------------------------------
Observations 3,112
R2 0.778
Adjusted R2 0.778
Residual Std. Error 160.019 (df = 3103)
F Statistic 1,361.864*** (df = 8; 3103)
=======================================================
Note: *p<0.1; **p<0.05; ***p<0.01
df_2020_emi <- c2c1519 %>%
filter(is.na(fips_b) == FALSE) %>%
mutate(comb_fips_a = paste0(state_code_a, fips_a)) %>%
group_by(county_name_a, comb_fips_a) %>%
summarize(flow_ab = sum(as.numeric(flow_ab))) %>%
arrange(comb_fips_a) %>%
mutate(fips = as.numeric(comb_fips_a))
`summarise()` has grouped output by 'county_name_a'. You can override using the `.groups` argument.
df_2020_election_emi <- left_join(df_2020_emi, countypres2020_2p, by = "fips")
df_2020_combined_emi <- left_join(df_2020_election_emi, controls_2019, by = "fips") %>%
mutate(avg_emi_rate = flow_ab/total_pop)
df_emi <- df_2020_combined_emi %>%
filter(Geo_STATE != "02") %>%
filter(Geo_STATE != "72")
write.csv(df_emi, "migration_df2.csv")
# Basic regression
stargazer(lm(dem_percent ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop), type = "text")
===============================================
Dependent variable:
---------------------------
dem_percent
-----------------------------------------------
avg_emi_rate -0.448**
(0.178)
Constant 0.549***
(0.010)
-----------------------------------------------
Observations 3,113
R2 0.002
Adjusted R2 0.002
Residual Std. Error 55.106 (df = 3111)
F Statistic 6.298** (df = 1; 3111)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
ggplot(df_2020_combined_emi, aes(x = avg_emi_rate, y = dem_percent, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Fall in Democratic Voting with Increase in Rate of Emigration",
x = "Emigration Rate",
y = "Two-Party Democratic Vote Share",
subtitle = "2020 Presidential Election",
caption = "Weighted by County Population")
stargazer(lm(dem_percent ~ avg_emi_rate + percent_white + percent_hispanic +
percent_bachelors + avg_wage + poverty + unemp + med_household_income, data = df_2020_combined_emi,
weight = total_pop),
lm(dem_percent ~ avg_emi_rate + percent_white + percent_hispanic +
percent_bachelors, data = df_2020_combined_emi,
weight = total_pop),
lm(dem_percent ~ avg_emi_rate + avg_wage + poverty + unemp + med_household_income, data = df_2020_combined_emi,
weight = total_pop),
type = "text")
======================================================================================================
Dependent variable:
---------------------------------------------------------------------------------
dem_percent
(1) (2) (3)
------------------------------------------------------------------------------------------------------
avg_emi_rate -0.538*** -0.591*** -0.417***
(0.094) (0.098) (0.140)
percent_white -0.485*** -0.533***
(0.011) (0.010)
percent_hispanic 0.176*** 0.180***
(0.010) (0.010)
percent_bachelors 1.073*** 0.796***
(0.024) (0.015)
avg_wage 0.00000*** 0.00001***
(0.00000) (0.00000)
poverty -0.237*** 1.558***
(0.066) (0.093)
unemp 1.826*** 0.167
(0.161) (0.245)
med_household_income -0.00000*** 0.00000***
(0.00000) (0.00000)
Constant 0.607*** 0.656*** -0.183***
(0.023) (0.012) (0.025)
------------------------------------------------------------------------------------------------------
Observations 3,112 3,113 3,112
R2 0.775 0.741 0.436
Adjusted R2 0.774 0.741 0.435
Residual Std. Error 26.217 (df = 3103) 28.081 (df = 3108) 41.457 (df = 3106)
F Statistic 1,333.636*** (df = 8; 3103) 2,224.033*** (df = 4; 3108) 480.322*** (df = 5; 3106)
======================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
plot_usmap(data = df_2020_combined, values = "avg_immi_rate", size = .1) +
scale_fill_continuous(low = "white", "high" = "black") +
labs(title = "Internal Immigration Rates from 2015-2019")
plot_usmap(data = df_2020_combined_emi, values = "avg_emi_rate", size = .1) +
scale_fill_continuous(low = "white", "high" = "black") +
labs(title = "Internal Emigration Rates from 2015-2019")
stargazer(lm(percent_white ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
lm(percent_hispanic ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
lm(percent_bachelors ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
type = "text")
the condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be usedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changed
================================================================================
Dependent variable:
------------------------------------------------
percent_white percent_hispanic percent_bachelors
(1) (2) (3)
--------------------------------------------------------------------------------
avg_emi_rate 0.478*** -3.221*** 1.322***
(0.170) (0.184) (0.113)
Constant 0.698*** 0.366*** 0.248***
(0.010) (0.011) (0.006)
--------------------------------------------------------------------------------
Observations 3,220 3,220 3,220
R2 0.002 0.087 0.041
Adjusted R2 0.002 0.086 0.041
Residual Std. Error (df = 3218) 53.085 57.426 35.105
F Statistic (df = 1; 3218) 7.883*** 305.654*** 137.843***
================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(lm(percent_white ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
lm(percent_hispanic ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
lm(percent_bachelors ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
type = "text")
the condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be usedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changed
================================================================================
Dependent variable:
------------------------------------------------
percent_white percent_hispanic percent_bachelors
(1) (2) (3)
--------------------------------------------------------------------------------
avg_emi_rate 0.478*** -3.221*** 1.322***
(0.170) (0.184) (0.113)
Constant 0.698*** 0.366*** 0.248***
(0.010) (0.011) (0.006)
--------------------------------------------------------------------------------
Observations 3,220 3,220 3,220
R2 0.002 0.087 0.041
Adjusted R2 0.002 0.086 0.041
Residual Std. Error (df = 3218) 53.085 57.426 35.105
F Statistic (df = 1; 3218) 7.883*** 305.654*** 137.843***
================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
stargazer(lm(avg_wage ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
lm(poverty ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
lm(unemp ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
lm(med_household_income ~ avg_emi_rate, data = df_2020_combined_emi, weight = total_pop),
type = "text")
the condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be usedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changed
==========================================================================================================================
Dependent variable:
------------------------------------------------------------------------------------------------------
avg_wage poverty unemp med_household_income
(1) (2) (3) (4)
--------------------------------------------------------------------------------------------------------------------------
avg_emi_rate 15,384.420 -0.171*** -0.172*** 73,601.550***
(15,432.640) (0.047) (0.013) (18,886.240)
Constant 49,381.010*** 0.133*** 0.047*** 61,268.500***
(889.940) (0.003) (0.001) (1,089.132)
--------------------------------------------------------------------------------------------------------------------------
Observations 3,218 3,141 3,219 3,220
R2 0.0003 0.004 0.052 0.005
Adjusted R2 -0.00000 0.004 0.052 0.004
Residual Std. Error 4,809,552.000 (df = 3216) 14.755 (df = 3139) 4.036 (df = 3217) 5,887,649.000 (df = 3218)
F Statistic 0.994 (df = 1; 3216) 13.012*** (df = 1; 3139) 176.189*** (df = 1; 3217) 15.187*** (df = 1; 3218)
==========================================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
Thoughts - add cost of living interacting with avg_wage?
stargazer(lm(avg_immi_rate ~ avg_wage, data = df_2020_combined, weight = total_pop),
lm(avg_immi_rate ~ poverty, data = df_2020_combined, weight = total_pop),
lm(avg_immi_rate ~ unemp, data = df_2020_combined, weight = total_pop),
lm(avg_immi_rate ~ med_household_income, data = df_2020_combined, weight = total_pop),
type = "text")
the condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be usedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changed
=====================================================================================================================
Dependent variable:
------------------------------------------------------------------------------------------------
avg_immi_rate
(1) (2) (3) (4)
---------------------------------------------------------------------------------------------------------------------
avg_wage -0.00000***
(0.00000)
poverty -0.032***
(0.009)
unemp -0.432***
(0.030)
med_household_income -0.000
(0.00000)
Constant 0.066*** 0.059*** 0.071*** 0.055***
(0.001) (0.001) (0.001) (0.001)
---------------------------------------------------------------------------------------------------------------------
Observations 3,218 3,141 3,219 3,220
R2 0.023 0.004 0.060 0.00001
Adjusted R2 0.022 0.004 0.060 -0.0003
Residual Std. Error 7.204 (df = 3216) 7.305 (df = 3139) 7.062 (df = 3217) 7.285 (df = 3218)
F Statistic 74.375*** (df = 1; 3216) 13.015*** (df = 1; 3139) 207.060*** (df = 1; 3217) 0.040 (df = 1; 3218)
=====================================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
ggplot(df_2020_combined, aes(x = avg_immi_rate, y = percent_white, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Immigration Rate is Higher in Counties with Higher White Proportion",
x = "Immigration Rate",
y = "Proportion White",
subtitle = "2019",
caption = "Weighted by County Population")
ggplot(df_2020_combined, aes(x = avg_immi_rate, y = percent_hispanic, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Immigration Rate is Lower in Counties with Higher Hispanic Proportion",
x = "Immigration Rate",
y = "Proportion Hispanic",
subtitle = "2019",
caption = "Weighted by County Population")
ggplot(df_2020_combined_emi, aes(x = avg_emi_rate, y = percent_white, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Emigration Rate is Higher in Counties with Higher White Proportion",
x = "Emigration Rate",
y = "Proportion White",
subtitle = "2019",
caption = "Weighted by County Population")
ggplot(df_2020_combined_emi, aes(x = avg_emi_rate, y = percent_hispanic, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Emigration Rate is Lower in Counties with Higher Hispanic Proportion",
x = "Emigration Rate",
y = "Proportion Hispanic",
subtitle = "2019",
caption = "Weighted by County Population")
ggplot(df_2020_combined, aes(x = avg_immi_rate, y = percent_bachelors, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Immigration Rate is Higher in Counties with Higher Bachelor's Proportion",
x = "Immigration Rate",
y = "Percent Bachelor's Degree",
subtitle = "2019",
caption = "Weighted by County Population")
ggplot(df_2020_combined_emi, aes(x = avg_emi_rate, y = percent_bachelors, size = total_pop)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm", aes(weight = total_pop), show.legend = FALSE) +
labs(title = "Emigration Rate is Higher in Counties with Higher Bachelor's Proportion",
x = "Emigration Rate",
y = "Percent Bachelor's Degree",
subtitle = "2019",
caption = "Weighted by County Population")